from collections import Counter
from math import log10

import numpy as np
 
def tfidf(docList):
    """Compute TF-IDF weights for every term of every document in a corpus.

    Args:
        docList: The corpus. Each element is one document, given either as
            a string whose words are separated by single spaces or as an
            already tokenized sequence of words.

    Returns:
        A dict mapping each document's position in ``docList`` to a dict of
        ``{term: tf-idf}``. ``tf`` is the term's count divided by the number
        of tokens in the document; ``idf`` is
        ``log10(number of documents / number of documents containing term)``.

    Side effects:
        Prints the vocabulary size and appends every distinct term, in order
        of first appearance in the corpus, to the module-level list
        ``all_word`` (the list is created if it does not exist yet).
    """
    # Accept raw strings as well as pre-tokenized documents. Strings are
    # split on a single space (not on arbitrary whitespace), so consecutive
    # spaces yield empty-string tokens exactly as before.
    docs = [
        doc.split(' ') if isinstance(doc, str) else list(doc)
        for doc in docList
    ]
    doc_num = len(docs)

    # Document frequency: in how many documents each term occurs.
    # dict.fromkeys() de-duplicates while keeping first-appearance order;
    # set() would make the vocabulary order depend on string hash
    # randomization and therefore differ from run to run.
    doc_freq = Counter()
    for doc in docs:
        for term in dict.fromkeys(doc):
            doc_freq[term] += 1

    # Inverse document frequency, base-10 logarithm.
    term_idf = {term: log10(doc_num / df) for term, df in doc_freq.items()}
    print('all word num = ', len(term_idf))

    # Per-document weights: (count / document length) * idf.
    term_tfidf = {}
    for doc_id, doc in enumerate(docs):
        doc_len = len(doc)
        term_tfidf[doc_id] = {
            term: count / doc_len * term_idf[term]
            for term, count in Counter(doc).items()
        }

    # Publish the vocabulary through the module-level ``all_word`` list that
    # the calling script reads; create it if the caller has not done so.
    globals().setdefault('all_word', []).extend(term_idf)

    return term_tfidf
 
 
# with open('demo.txt') as f:
#     data = []
#     for line in f.readlines():
#         if line != '\n':
#             line = line.strip('\n').strip('.[]()')
#             data.append(line)
 
# print(data)
# print('all doc num = ', len(data))
# Vocabulary list; tfidf() appends every distinct corpus term to it.
all_word = []

# NOTE(review): `data` is not defined anywhere in the visible part of this
# file (the loader above is commented out). It must hold the corpus before
# this point, otherwise the next line raises NameError.
score = tfidf(data)

# Dense document-term matrix: one row per document, one column per term.
X = np.zeros((len(data), len(all_word)))

# Resolve each term to the column of its first occurrence once, instead of
# scanning the vocabulary list for every single weight.
column_of = {}
for column, word in enumerate(all_word):
    column_of.setdefault(word, column)

# Fill the matrix row by row, in the order the documents appear in `score`.
for doc_id, weights in enumerate(score.values()):
    for term, weight in weights.items():
        column = column_of.get(term)
        if column is not None:
            X[doc_id][column] = float(weight)
 
# print(X)
 